pigz -k -c -d Rattus_norvegicus.mRatBN7.2.dna_sm.toplevel.fa.gz | faOneRecord stdin MT > Rattus_norvegicus.mRatBN7.2.dna_sm.toplevel.mito_only.fa
1 Introduction
For more details, refer to ENCODE ATAC-Seq pipeline from GitHub, ENCODE ATAC-Seq pipeline from Google doc, ENCODE ChIP-Seq pipeline from GitHub, and ENCODE ChIP-Seq pipeline from Google doc.
2 Extract mitochondrial genome if present
3 Prepare Bowtie2 index of species/mitochondrial genome
#!/usr/bin/bash
ref_fa=/data/biodatabase/species/sugar_glider_slb_v1/encode_references/bulk_atac_chip_seq/sugarglider.mito_only.fasta.gz
bowtie2_index_dir=/data/biodatabase/species/sugar_glider_slb_v1/encode_references/bulk_atac_chip_seq
bowtie2_index_prefix=mito
bowtie2_n_threads=60
tmp_dir=/data/tmp
cd ${bowtie2_index_dir}
if [ ${ref_fa##*.} == "gz" ]
then
new_ref_fa=${tmp_dir}/$(basename ${ref_fa} .gz)
pigz -k -c -d ${ref_fa} > ${new_ref_fa}
else
new_ref_fa=${ref_fa}
fi
bowtie2-build --threads ${bowtie2_n_threads} -f ${new_ref_fa} ${bowtie2_index_prefix}
tar -cvf ${bowtie2_index_prefix}.bt2_index.tar *.bt2
rm *.bt2
if [ ${ref_fa##*.} == "gz" ]
then
rm ${new_ref_fa}
fi
4 Prepare transcription start sites
Note: GFF3 is 1-based coordinate system, and BED is 0-based coordinate system.
library(rtracklayer)
library(vroom)
library(tidyverse)
<- "/data/biodatabase/species/mRatBN7/genome/anno/Rattus_norvegicus.mRatBN7.2.111.gff3.gz"
gff_file <- "/data/biodatabase/species/mRatBN7/encode_references/bulk_atac_chip_seq/mRatBN7.tss.bed"
output_file <- "gene"
target
<- import(gff_file, format = "gff3") %>%
df as.data.frame() %>%
as_tibble()
<- df %>%
df filter(type %in% target) %>%
select(seqnames, start, end, strand) %>%
mutate(
tss_start = if_else(strand %in% c("+", ".", "*"), start - 1, end - 1),
tss_end = if_else(strand %in% c("+", ".", "*"), start, end)
%>%
) select(seqnames, tss_start, tss_end) %>%
arrange(seqnames, tss_start, tss_end) %>%
distinct()
vroom_write(df, file = output_file, col_names = FALSE, append = FALSE)
system2("/usr/bin/pigz", output_file)